R Markdown

Data set source: [https://dcc.icgc.org/releases/release_28/Projects/BRCA-US]

x <- read.csv('exp_seq.BRCA-US.tsv.gz',sep = '\t', nrows = 2000, na.strings = '?')
str(x)
## 'data.frame':    2000 obs. of  22 variables:
##  $ icgc_donor_id           : chr  "DO1808" "DO1808" "DO1808" "DO1808" ...
##  $ project_code            : chr  "BRCA-US" "BRCA-US" "BRCA-US" "BRCA-US" ...
##  $ icgc_specimen_id        : chr  "SP3930" "SP3930" "SP3930" "SP3930" ...
##  $ icgc_sample_id          : chr  "SA42163" "SA42163" "SA42163" "SA42163" ...
##  $ submitted_sample_id     : chr  "TCGA-E2-A15C-01A-31R-A12D-07" "TCGA-E2-A15C-01A-31R-A12D-07" "TCGA-E2-A15C-01A-31R-A12D-07" "TCGA-E2-A15C-01A-31R-A12D-07" ...
##  $ analysis_id             : int  7768 7768 7768 7768 7768 7768 7768 7768 7768 7768 ...
##  $ gene_model              : chr  "GAF" "GAF" "GAF" "GAF" ...
##  $ gene_id                 : chr  "ALPL" "ALPK1" "ALPK2" "ALPK3" ...
##  $ normalized_read_count   : num  4.49e-06 2.03e-05 7.66e-07 3.50e-06 0.00 ...
##  $ raw_read_count          : int  278 2189 140 607 0 0 347 20 17 1291 ...
##  $ fold_change             : logi  NA NA NA NA NA NA ...
##  $ assembly_version        : chr  "GRCh37" "GRCh37" "GRCh37" "GRCh37" ...
##  $ platform                : chr  "Illumina HiSeq" "Illumina HiSeq" "Illumina HiSeq" "Illumina HiSeq" ...
##  $ total_read_count        : logi  NA NA NA NA NA NA ...
##  $ experimental_protocol   : chr  "RNASeqV2_RSEM_genes https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor" "RNASeqV2_RSEM_genes https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor" "RNASeqV2_RSEM_genes https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor" "RNASeqV2_RSEM_genes https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor" ...
##  $ alignment_algorithm     : logi  NA NA NA NA NA NA ...
##  $ normalization_algorithm : logi  NA NA NA NA NA NA ...
##  $ other_analysis_algorithm: logi  NA NA NA NA NA NA ...
##  $ sequencing_strategy     : chr  "RNA-Seq" "RNA-Seq" "RNA-Seq" "RNA-Seq" ...
##  $ raw_data_repository     : chr  "CGHub" "CGHub" "CGHub" "CGHub" ...
##  $ raw_data_accession      : chr  "1a77082d-f130-436b-9028-23c8dd9dc565" "1a77082d-f130-436b-9028-23c8dd9dc565" "1a77082d-f130-436b-9028-23c8dd9dc565" "1a77082d-f130-436b-9028-23c8dd9dc565" ...
##  $ reference_sample_type   : logi  NA NA NA NA NA NA ...

Dropping the columns with NA values or not relevant information

x <- subset(x, select = -c(fold_change,total_read_count,platform,raw_data_repository,experimental_protocol,alignment_algorithm,normalization_algorithm,other_analysis_algorithm,reference_sample_type))

x[is.na(x)] <- 0 # replacing NA values with a 0

str(x)
## 'data.frame':    2000 obs. of  13 variables:
##  $ icgc_donor_id        : chr  "DO1808" "DO1808" "DO1808" "DO1808" ...
##  $ project_code         : chr  "BRCA-US" "BRCA-US" "BRCA-US" "BRCA-US" ...
##  $ icgc_specimen_id     : chr  "SP3930" "SP3930" "SP3930" "SP3930" ...
##  $ icgc_sample_id       : chr  "SA42163" "SA42163" "SA42163" "SA42163" ...
##  $ submitted_sample_id  : chr  "TCGA-E2-A15C-01A-31R-A12D-07" "TCGA-E2-A15C-01A-31R-A12D-07" "TCGA-E2-A15C-01A-31R-A12D-07" "TCGA-E2-A15C-01A-31R-A12D-07" ...
##  $ analysis_id          : int  7768 7768 7768 7768 7768 7768 7768 7768 7768 7768 ...
##  $ gene_model           : chr  "GAF" "GAF" "GAF" "GAF" ...
##  $ gene_id              : chr  "ALPL" "ALPK1" "ALPK2" "ALPK3" ...
##  $ normalized_read_count: num  4.49e-06 2.03e-05 7.66e-07 3.50e-06 0.00 ...
##  $ raw_read_count       : int  278 2189 140 607 0 0 347 20 17 1291 ...
##  $ assembly_version     : chr  "GRCh37" "GRCh37" "GRCh37" "GRCh37" ...
##  $ sequencing_strategy  : chr  "RNA-Seq" "RNA-Seq" "RNA-Seq" "RNA-Seq" ...
##  $ raw_data_accession   : chr  "1a77082d-f130-436b-9028-23c8dd9dc565" "1a77082d-f130-436b-9028-23c8dd9dc565" "1a77082d-f130-436b-9028-23c8dd9dc565" "1a77082d-f130-436b-9028-23c8dd9dc565" ...

Clustering

It’s time to build the distance matrix. We use the euclidean distance method.

d <- dist(x, method = 'euclidean')
## Warning in dist(x, method = "euclidean"): NAs introduced by coercion

Hierarchial clustering

We proceed with average linkage method.

h <- hclust(d, method = 'average')

Looking at the dendogram

plot(h)

Next, we can cut the dendrogram in order to create the desired number of clusters.

cut_avg <- cutree(h, k=10)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
plot(h)
rect.hclust(h , k = 10, border = 2:6)
abline(h = 10, col = 'red')

Now we can see the three clusters enclosed in three different colored boxes.

suppressPackageStartupMessages(library(dendextend))
avg_dend_obj <- as.dendrogram(h)
avg_col_dend <- color_branches(h, h = 10)
plot(avg_col_dend)

Now we append the cluster results obtained back in the original dataframe under column name the cluster with mutate(), from the dplyr package and count how many observations were assigned to each cluster with the count() function.

suppressPackageStartupMessages(library(dplyr))
x_clusters <- mutate(x, cluster = cut_avg)
count(x_clusters,cluster)
##    cluster    n
## 1        1 1893
## 2        2   82
## 3        3   11
## 4        4    7
## 5        5    2
## 6        6    1
## 7        7    1
## 8        8    1
## 9        9    1
## 10      10    1

Visualizing some relationships of the data

suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(plotly))

ggplotly(ggplot(x_clusters, aes(x=gene_id, y = normalized_read_count, color = factor(cluster))) + geom_point())